Expression table

####################################
#Preparing the expression table
####################################

## Set working directory for first batch of data and import rsem data ##
dir = "/sc/hydra/projects/pd-omics/mynd/data/PBG-2097"
samples = read.table("/sc/hydra/projects/pd-omics/mynd/qc/samples.batch1", header = F, check.names = F)
sample = samples$V1
files = file.path(dir, sample, "Processed/RAPiD/rsem", paste0(sample, ".genes.results"))
names(files) = sample
txi.rsem = tximport(files, type = "rsem")
counts1 = data.frame(txi.rsem$counts, check.names = F)
tpms1 = data.frame(txi.rsem$abundance, check.names = F)

## Repeat for the second batch of data ##
dir = "/sc/hydra/projects/pd-omics/mynd/data/PBG-2022"
samples = read.table("/sc/hydra/projects/pd-omics/mynd/qc/samples.batch2", header = F, check.names = F)
sample = samples$V1
files = file.path(dir, sample, "Processed/RAPiD/rsem", paste0(sample, ".genes.results"))
names(files) = sample
txi.rsem1 = tximport(files, type = "rsem")
counts2 = data.frame(txi.rsem1$counts, check.names = F)
tpms2 = data.frame(txi.rsem1$abundance, check.names = F)

## We need to change sample names because of overlap between batches. We will append the batch name to the sample ID in the count matrix and then merge the two datasets ##

colnames(counts1) <- paste(colnames(counts1), "1", sep = "_")
colnames(tpms1) <- paste(colnames(tpms1), "1", sep = "_")
colnames(counts2) <- paste(colnames(counts2), "2", sep = "_")
colnames(tpms2) <- paste(colnames(tpms2), "2", sep = "_")
dim(counts1)

[1] 58929 216

[1] 58929 121

[1] 58929 216

[1] 58929 121

[1] 58929 337

[1] 58929 337

[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [99] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [113] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [127] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [141] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [183] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [197] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [225] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [239] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [267] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [281] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [295] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [309] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [323] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [337] TRUE

Normalization

##Exploratory Analysis ### Library sizes

[1] 337

Density plot

#Density plot with all samples
# cpm table = ppmi_cpm_case_control
# log cpm table = lcpm

lcounts = log2(mynd_counts)
lcpm = log2(mynd_cpm)
ltpm = log2(mynd_abundance)
# voom use log alread

nsamples <- ncol(mynd_cpm)
#samplenames <- colnames(mynd_cpm)

colfunc <- colorRampPalette(c("#4DBBD5FF", "#3C5488FF"))
col = alpha(colfunc(nsamples), alpha = 0.1)

#png(paste0(work_plots, "Density_all.#png"), width = 8, height = 6, res = 300, units = "in")
par(mfrow=c(2,2))
#COUNTS
plot(density(lcounts[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. COUNTS ", xlab="Log2(counts)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(lcounts[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")

#CPM
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. CPM ", xlab="Log2(CPM)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(lcpm[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")

#TPM
plot(density(ltpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="C. TPM ", xlab="Log2(TPM)")
#abline(v=ltpm.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(ltpm[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")

#Voom
plot(density(mynd_voom[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="D. VOOM ", xlab="voom")
#abline(v=lvoom.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(mynd_voom[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#dev.off()